In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import random
def random_string(length,alphabet_list):
    rand_str = ''.join(random.choice(alphabet_list) for i in range(length))
    return rand_str

def perturb(seed,alphabet_list,p=0.5):
    seq=''
    for c in seed:
        if random.random() < p: c = random.choice(alphabet_list)
        seq += c
    return seq

def make_artificial_dataset(alphabet='ACGT', motives=None, motif_length=6, 
                            sequence_length=100, n_sequences=1000, n_motives=2, p=0.2,
                           random_state=1):
    random.seed(random_state)
    alphabet_list=[c for c in alphabet]
    
    if motives is None:
        motives=[]
        for i in range(n_motives):
            motives.append(random_string(motif_length,alphabet_list))
    else:
        motif_length = len(motives[0])
        n_motives = len(motives)
    
    sequence_length = sequence_length / len(motives)
    flanking_length = (sequence_length - motif_length ) / 2
    n_seq_per_motif = n_sequences

    counter=0
    seqs=[]
    for i in range(n_seq_per_motif):
        total_seq = ''
        total_binary_seq=''
        for j in range(n_motives):
            left_flanking = random_string(flanking_length,alphabet_list)
            right_flanking = random_string(flanking_length,alphabet_list)
            noisy_motif = perturb(motives[j],alphabet_list,p)
            seq = left_flanking + noisy_motif + right_flanking
            total_seq += seq
        seqs.append(('ID%d'%counter,total_seq))
        counter += 1
    binary_skeleton = '0' * flanking_length + '1' * motif_length + '0' * flanking_length
    binary_seq = binary_skeleton * n_motives
    return motives, seqs, binary_seq

In [37]:
from eden.motif import SequenceMotif
#help(SequenceMotif)

Experimental Setup


In [53]:
#setup parameters
alphabet='ACGT'
semi_len=5
motives=['A'*semi_len+'C'*semi_len,
         #'C'*semi_len+'A'*semi_len,
         #'A'*semi_len+'T'*semi_len,
         'T'*semi_len+'A'*semi_len]
sequence_length=40
n_sequences=200
p=0.8
n_motives = 2
motif_length = 10

#make dataset
motives, seqs, binary_seqs = make_artificial_dataset(alphabet=alphabet,
                                        motives=motives,
                                        #n_motives = n_motives,
                                        #motif_length = motif_length,
                                        sequence_length=sequence_length,
                                        n_sequences=n_sequences,
                                        p=p,
                                        random_state=8)

In [ ]:


In [3]:
from utilities import Weblogo
from IPython.display import Image, display

In [63]:
img = Weblogo().create_logo(seqs)
display(Image(img))



In [41]:
# display first 10 sequences and the corresponding binary sequences
for i in range(10):
    print seqs[i][0]
    print seqs[i][1]
    print binary_seqs[i][1]
    print


>ID0
ATAGAAATACCCCCCATAGCAGCGCATTTTAAAAAGCGTG
0000011111111110000000000111111111100000

>ID1
ATAGGAATAACCCCCTTGAACTTACTTTCTAAAAAAGTAT
0000011111111110000000000111111111100000

>ID2
AAAGCAAAAACTCCTTGGTATTGATATTCTAAAAACAGCC
0000011111111110000000000111111111100000

>ID3
ACAGGAAAAACCCCGGACCATGTTATTTTTAACAACGTCT
0000011111111110000000000111111111100000

>ID4
CCGTGAAAAAGCCTCTGTGCCTGCTTTTTTAAAAACCTTG
0000011111111110000000000111111111100000

>ID5
TTTAGAGAAATCCCCGACTTTGGCCTCTTTAAGAACCGAG
0000011111111110000000000111111111100000

>ID6
GTGCTACATTCCCCCGCGTTTTGTCTTTTTAAAAACTGCG
0000011111111110000000000111111111100000

>ID7
TGACAGAATACCTGCCTCGGTCTGCTTTATAAAAAAACCC
0000011111111110000000000111111111100000

>ID8
GCGTTAAAAACCCCCGGCTCCCCGGTTTTTAGAAAAATAT
0000011111111110000000000111111111100000

>ID9
GGGGATGAAACCCACTCTCTTGCGGTTTTTAAAAACGAAA
0000011111111110000000000111111111100000


In [42]:
#display
print 'Motives and sample of their perturbed variants:'
alphabet_list=[c for c in alphabet]
for motif in motives: 
    print
    print 'true motif:', motif, ' noisy: ',
    for i in range(4):
        print perturb(motif,alphabet_list,p=p),


Motives and sample of their perturbed variants:

true motif: AAAAACCCCC  noisy:  TCAAACCCCC ATATACCCCC AAAAACCCCC TAAAACCCCC
true motif: TTTTTAAAAA  noisy:  TTTATAAAAA TTTTTAAAAA TTTTTCAAAA TTTTTAAAAG

In [4]:
from sklearn.cluster import KMeans
from eden_wrapper import EdenWrapper
from meme_wrapper import Meme
from smod_wrapper import SMoDWrapper

In [44]:
# Parameters for tools

alphabet = 'dna'    # Sequence type
scoring_criteria = "pwm"    # ["pwm", "hmm"]
nmotifs = 4    # No. of motives to be found
minw = len(motives[0])-2    # Minimum width of motif
maxw = len(motives[0])+2    # Maximum width of motif

from utilities import Weblogo
wl = Weblogo(output_format='png',
             sequence_type='dna',
             resolution=200,
             stacks_per_line=40,
             units='probability')

For EDeN:


In [9]:
from eden.util import configure_logging
import logging
configure_logging(logging.getLogger(),verbosity=2)

In [10]:
%%time
km = KMeans(n_clusters=nmotifs)
tool = EdenWrapper(alphabet=alphabet,
                   scoring_criteria = scoring_criteria,
                   complexity=5, 
                   nbits=14, 
                   negative_ratio=3,
                   min_subarray_size=minw, 
                   max_subarray_size=maxw,
                   clustering_algorithm=km,
                   
                   weblogo_obj=wl)
tool.fit(seqs)


Positive data: Instances: 200 ; Features: 16385 with an avg of 5662 features per instance
Negative data: Instances: 600 ; Features: 16385 with an avg of 5877 features per instance
Elapsed time: 44.9 secs
model induction: 200 positive instances 46 s
motives extraction: 246 motives in 13s
motives clustering: 4 clusters in 1s
after filtering: 238 motives 4 clusters in 0s
motif model construction in 0s
updated motif counts in 0s
CPU times: user 29.7 s, sys: 2.73 s, total: 32.4 s
Wall time: 1min 3s

For MEME:


In [94]:
%%time
# save input sequences as fasta file for MEME tool
with open('seqs.fa','w') as f_train:
    for seq in seqs:
        f_train.write('>' + seq[0] + ' \n')
        f_train.write(seq[1] + '\n')

tool =  Meme(alphabet='dna',
             scoring_criteria = scoring_criteria,
             minw=minw,
             maxw=maxw,
             nmotifs=nmotifs,
             
             weblogo_obj=wl)
tool.fit('seqs.fa')


CPU times: user 104 ms, sys: 40 ms, total: 144 ms
Wall time: 8.64 s

For SMoD:


In [85]:


In [54]:
tool = SMoDWrapper(scoring_criteria = 'pwm',
                   #complexity = complexity,
                   #n_clusters = n_clusters,
                   min_subarray_size = 8,
                   max_subarray_size = 12,
                   pos_block_size = 20,
                   neg_block_size = 20,
                   clusterer = KMeans(),
                   min_score = 7,
                   min_freq = 0.69,
                   min_cluster_size = 1,
                   similarity_th = 0.5,
                   freq_th = 0.69,
                   p_value=1,
                   regex_th=0.92,
                   #sample_size=sample_size,
                   std_th=0.48
                   )
from eden.modifier.seq import seq_to_seq, shuffle_modifier
neg_seqs = seq_to_seq(seqs, modifier=shuffle_modifier, times=1, order=2)
neg_seqs = list(neg_seqs)

block_size=n_sequences/8

pos_size = len(seqs)
train_pos_seqs = seqs[:pos_size/2]
test_pos_seqs = seqs[pos_size/2:]

neg_size = len(neg_seqs)
train_neg_seqs = neg_seqs[:neg_size/2]
test_neg_seqs = neg_seqs[neg_size/2:]
tool.fit(seqs, neg_seqs)
#smod.display_logo()

In [55]:
tool.display()


        0      1      2      3      4      5      6      7      8      9     10
-:   0.50   0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00
A:   0.00   0.50   0.00   0.00   0.00   1.00   0.00   0.00   0.00   0.00   1.00
C:   0.50   0.00   0.00   1.00   1.00   0.00   0.00   1.00   1.00   0.50   0.00
G:   0.00   0.50   1.00   0.00   0.00   0.00   1.00   0.00   0.00   0.00   0.00
T:   0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.50   0.00

        0      1      2      3      4      5      6      7      8      9     10     11
-:   0.00   0.50   0.50   0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.50   0.50
A:   1.00   0.00   0.00   0.00   0.00   0.00   0.00   1.00   0.00   0.00   0.00   0.50
C:   0.00   0.00   0.50   0.00   1.00   0.00   0.00   0.00   0.00   1.00   0.50   0.00
G:   0.00   0.50   0.00   1.00   0.00   0.00   0.00   0.00   1.00   0.00   0.00   0.00
T:   0.00   0.00   0.00   0.00   0.00   1.00   1.00   0.00   0.00   0.00   0.00   0.00


In [99]:
% matplotlib inline

img = Weblogo().create_logo(seqs)
display(Image(img))

import pylab as plt 
import numpy as np
sig = np.zeros(sequence_length)

print "Average over all sequences"
for j in range(len(motives)):
    for i in range(len(seqs)):
        score = tool.score(motif_num=j+1, seq=seqs[i][1])
        sig += np.array(score)
    sig /= float(len(seqs))
    #sig *= len(motives)
    plt.figure(figsize=(16,3))
    plt.fill_between(range(len(sig)), 0, sig, alpha=0.3)
    plt.show()


print
print "for individual sequences"

for i in range(len(seqs[:10])):
    sig = np.zeros(sequence_length)
    for j in range(len(motives)):
        score = tool.score(motif_num=j+1, seq=seqs[i][1])
        sig += np.array(score)
    sig /= float(len(seqs))
    sig *= len(motives)
    plt.figure(figsize=(16,3))
    plt.fill_between(range(len(sig)), 0, sig, alpha=0.3)
    plt.xticks(range(len(sig)), seqs[i][1])
    plt.show()


Average over all sequences
for individual sequences

In [21]:
tool.display_logo(do_alignment=True)


Scoring Methods Comparison

Scoring with PWM


In [22]:
%%time
test_seq = seqs[0][1]
score_pwm = tool.score(motif_num=1, seq=test_seq)
print score_pwm


[2.081911965929634e-14, 5.2757524291839265e-17, 1.0210469211002624e-17, 2.8777142743138093e-15, 5.2145794967159266e-18, 5.615698030490886e-21, 3.602512882618652e-22, 9.640745945565149e-23, 1.1174093019854318e-23, 4.0787457165811803e-25, 3.722871480469391e-25, 9.495673161158559e-23, 1.836976780861523e-26, 9.290153920647594e-24, 7.548250060526172e-22, 1.0204049926802083e-20, 8.921518025125857e-21, 1.652408455811946e-19, 4.8491074442448796e-21, 1.4209244098238678e-19, 1.3803607857838013e-21, 1.2341076640677424e-18, 5.7305133242869754e-21, 2.420942407245905e-21, 5.0675363615126264e-24, 6.332576766121355e-22, 3.745683831062127e-21, 9.316412825096773e-21, 1.300714081347494e-19, 8.614118519408e-23, 1.1401423637962047e-22, 3.28220762044957e-22, 1.4047201268072436e-21, 5.06678237604e-25, 2.104525862151618e-20, 1.172649303006197e-22, 1.4688560114468176e-22, 3.576057282891844e-24, 3.387054982968658e-23, 2.842993058897474e-22, 1.5673810991565905e-21, 1.8607014460570877e-21, 1.804852813800267e-20, 3.366863080423703e-21, 5.97054031273854e-20, 4.947921622418543e-19, 2.272779950020669e-18, 3.140060064095691e-18, 1.7428081723501686e-19, 1.4287605625151316e-17, 4.403462307740887e-18, 5.547034448248737e-19, 1.3140918112544755e-19, 1.3633069830893101e-17, 1.8941013581748018e-19, 7.254443844030335e-17, 1.9277282358016904e-19, 1.1666243175922835e-20, 3.185387214225236e-20, 2.5101694102077552e-20, 2.3350015970062034e-20, 8.073555803192883e-19, 6.044157458117578e-21, 1.025671119895422e-23, 8.195461908573478e-21, 3.906628217882047e-18, 1.2269585777895515e-17, 5.812301427276959e-20, 5.120206393695582e-22, 2.041286640576183e-22, 2.9792452179629514e-20, 4.743482265246865e-22, 4.0203219291551324e-20, 7.464273979115359e-23, 7.99380648771001e-23, 2.3050656790958667e-22, 1.8036334102026236e-20, 1.2542074656831717e-20, 1.556769391270943e-19, 6.4237274977320245e-18, 1.955208394326892e-16, 1.0098606892419099e-19, 7.875802126061102e-23, 2.560801485945777e-17, 3.3629299079856486e-19, 5.767664178330817e-21, 5.68685592219878e-22, 1.0150944245881196e-19, 1.6393637246760211e-22, 1.2261067726521358e-22, 8.20586275519323e-21, 6.319285549734e-19, 7.081710327886467e-21, 1.9464690237207195e-17, 3.5680955663660106e-16, 8.493003783713901e-16, 1.2219371118916824e-16, 4.575082103336705e-16, 4.3415058391182655e-14, 1.1416860364072179e-11, 1.4003201826714151e-15, 6.141527714740993e-16, 6.122784068836026e-20, 1.5635046855190377e-16, 2.2072393189037957e-17, 2.4359555152766432e-18, 1.1297544000371727e-18, 2.484134482576455e-18, 1.0509215124970699e-23, 2.8464820711708e-20, 4.222506216996585e-20, 1.8094085312511865e-18, 1.9476840706388936e-21, 8.208499999599901e-23, 2.584368030785952e-20, 2.1853788065953597e-22, 9.416256617834473e-24, 8.566288731000005e-24, 2.303788265159062e-25, 5.298944236523437e-28, 6.848885425706544e-26, 1.2887757800190003e-25, 2.4214641496919994e-23, 6.969730530360354e-21, 5.361510394127919e-18, 6.005995368885235e-15, 4.542687736721545e-12, 2.0748726237475664e-10, 1.5024449078930943e-07, 0.00017531905199324804, 4.801081356075259e-06, 2.1950515452813172e-07, 1.05333575943773e-09, 1.450875701704862e-12, 1.7239823717279277e-14, 1.2739251086927107e-17, 3.3531432564956114e-19, 3.7398224272102506e-20, 2.749719664905963e-22, 1.2633477028740068e-21, 5.352706701341997e-25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
CPU times: user 12 ms, sys: 4 ms, total: 16 ms
Wall time: 12.3 ms

In [23]:
import matplotlib.pyplot as plt
plt.plot(score_pwm)
plt.xlabel('Sequence Position')
plt.ylabel('Score')
plt.show()



In [28]:
from sklearn.metrics import roc_auc_score

true_score = [float(int(x)) for x in binary_seqs[0][1]]
roc_score = roc_auc_score(true_score, score_pwm)
print "ROC Score: ", roc_score


ROC Score:  0.489820075758

Scoring with HMM


In [32]:
%%time
km = KMeans(n_clusters=nmotifs)
tool = EdenWrapper(alphabet=alphabet,
                   scoring_criteria = "hmm",
                   complexity=5, 
                   nbits=14, 
                   negative_ratio=3,
                   min_subarray_size=5, 
                   max_subarray_size=7,
                   clustering_algorithm=km,
                   
                   weblogo_obj=wl)
tool.fit(seqs)


Positive data: Instances: 200 ; Features: 16385 with an avg of 5662 features per instance
Negative data: Instances: 600 ; Features: 16385 with an avg of 5877 features per instance
Elapsed time: 57.6 secs
model induction: 200 positive instances 59 s
motives extraction: 90 motives in 15s
motives clustering: 4 clusters in 0s
after filtering: 17 motives 4 clusters in 0s
motif model construction in 0s
updated motif counts in 0s
CPU times: user 39.5 s, sys: 5.07 s, total: 44.6 s
Wall time: 1min 18s

In [33]:
%%time
test_seq = seqs[0][1]
score_mm = tool.score(motif_num=1, seq=test_seq)
print score_mm


[-230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -25.619725748245855, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -21.217810050974293, -15.999216980991333, -19.451606706848878, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -8.67360002713884, -14.06457789299254, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -2.7732838728205036, -2.7732838728205036, -2.7732838728205036, -4.481772403227368, -8.67360002713884, -14.06457789299254, -19.31252354707684, -23.643434330760346, -25.619725748245855, -25.619725748245855, -25.619725748245855, -25.619725748245855, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, -230.25850929940458, 0, 0, 0, 0, 0, 0]
CPU times: user 72 ms, sys: 4 ms, total: 76 ms
Wall time: 77.1 ms

In [34]:
import matplotlib.pyplot as plt
plt.plot(score_mm)
plt.xlabel('Sequence Position')
plt.ylabel('Score')
plt.show()



In [35]:
true_score = [float(int(x)) for x in binary_seqs[0][1]]
roc_score = roc_auc_score(true_score, score_mm)
print "ROC Score: ", roc_score


ROC Score:  0.582386363636

In [ ]: